Gensim is a popular open-source NLP library. It's used to perform tasks such as building token dictionaries and bag-of-words corpora from raw text.


In [2]:
from gensim.corpora.dictionary import Dictionary
from nltk.tokenize import word_tokenize


C:\anaconda\lib\site-packages\gensim\utils.py:865: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
  warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")

In [3]:
# Toy corpus: six short movie-related sentences to vectorize below.
my_documents = [
    'The movie was about a spaceship and aliens.',
    'I really liked the movie!',
    'Awesome action scenes, but boring characters.',
    'The movie was awful! I hate alien films.',
    'Space is cool! I liked the movie.',
    'More space films, please!',
]

In [6]:
# Lower-case each document and split it into word/punctuation tokens,
# then build a gensim Dictionary mapping each unique token to an integer id.
tokenized_docs = [word_tokenize(document.lower()) for document in my_documents]
dictionary = Dictionary(tokenized_docs)

In [11]:
# Mapping from each token string to the integer id assigned by the Dictionary
dictionary.token2id


Out[11]:
{'!': 12,
 ',': 16,
 '.': 8,
 'a': 4,
 'about': 3,
 'action': 14,
 'alien': 22,
 'aliens': 7,
 'and': 6,
 'awesome': 13,
 'awful': 20,
 'boring': 18,
 'but': 17,
 'characters': 19,
 'cool': 26,
 'films': 23,
 'hate': 21,
 'i': 9,
 'is': 25,
 'liked': 11,
 'more': 27,
 'movie': 1,
 'please': 28,
 'really': 10,
 'scenes': 15,
 'space': 24,
 'spaceship': 5,
 'the': 0,
 'was': 2}

In [12]:
# Reverse lookup: the token string for id 9 (inverse of token2id)
dictionary[9]


Out[12]:
'i'

In [9]:
# Bag-of-words corpus: each document becomes a list of
# (token_id, token_count) pairs via doc2bow.
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]
corpus


Out[9]:
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
 [(0, 1), (1, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(8, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)],
 [(0, 1),
  (1, 1),
  (2, 1),
  (8, 1),
  (9, 1),
  (12, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1)],
 [(0, 1), (1, 1), (8, 1), (9, 1), (11, 1), (12, 1), (24, 1), (25, 1), (26, 1)],
 [(12, 1), (16, 1), (23, 1), (24, 1), (27, 1), (28, 1)]]

In [14]:
# Take the fifth document's bag-of-words and sort it by token count,
# most frequent token first.
doc = corpus[4]
bow_doc = sorted(doc, key=lambda pair: pair[1], reverse=True)
bow_doc


Out[14]:
[(0, 1), (1, 1), (8, 1), (9, 1), (11, 1), (12, 1), (24, 1), (25, 1), (26, 1)]

In [16]:
# Show the five most frequent tokens of this document with their counts.
for token_id, count in bow_doc[:5]:
    print(dictionary.get(token_id), count)


the 1
movie 1
. 1
i 1
liked 1

In [21]:
# Aggregate token frequencies across every document in the corpus.
from collections import defaultdict
from itertools import chain

totalfreq = defaultdict(int)
# chain.from_iterable flattens the per-document (id, count) lists
# into one stream so we can sum counts per token id.
for token_id, count in chain.from_iterable(corpus):
    totalfreq[token_id] += count

In [23]:
# Rank token ids by their total corpus frequency, highest first.
sorted_freq = sorted(totalfreq.items(), key=lambda item: item[1], reverse=True)

In [24]:
# Show the five most frequent tokens across the whole corpus.
for token_id, total in sorted_freq[:5]:
    print(dictionary.get(token_id), total)


the 4
movie 4
. 4
! 4
i 3